library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(faraway)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Read in the data
redwines <- read.csv("redwines.csv")
head(redwines)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
pairs(redwines[,-c(11,12)])
full.model <- lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide+ total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(full.model)
##
## Call:
## lm(formula = alcohol ~ fixed.acidity + volatile.acidity + citric.acid +
## residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## density + pH + sulphates, data = redwines[, -c(12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.07175 -0.39267 -0.04056 0.35396 2.44365
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.072e+02 1.308e+01 46.419 < 2e-16 ***
## fixed.acidity 5.324e-01 2.064e-02 25.796 < 2e-16 ***
## volatile.acidity 3.608e-01 1.144e-01 3.154 0.001638 **
## citric.acid 8.306e-01 1.379e-01 6.024 2.11e-09 ***
## residual.sugar 2.844e-01 1.229e-02 23.135 < 2e-16 ***
## chlorides -1.462e+00 3.956e-01 -3.696 0.000227 ***
## free.sulfur.dioxide -2.143e-03 2.057e-03 -1.042 0.297517
## total.sulfur.dioxide -2.296e-03 6.881e-04 -3.336 0.000868 ***
## density -6.174e+02 1.342e+01 -45.998 < 2e-16 ***
## pH 3.762e+00 1.551e-01 24.263 < 2e-16 ***
## sulphates 1.247e+00 1.037e-01 12.020 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.614 on 1588 degrees of freedom
## Multiple R-squared: 0.6701, Adjusted R-squared: 0.668
## F-statistic: 322.5 on 10 and 1588 DF, p-value: < 2.2e-16
round(cor(redwines[,-c(11,12)]),dig=2)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity 1.00 -0.26 0.67 0.11
## volatile.acidity -0.26 1.00 -0.55 0.00
## citric.acid 0.67 -0.55 1.00 0.14
## residual.sugar 0.11 0.00 0.14 1.00
## chlorides 0.09 0.06 0.20 0.06
## free.sulfur.dioxide -0.15 -0.01 -0.06 0.19
## total.sulfur.dioxide -0.11 0.08 0.04 0.20
## density 0.67 0.02 0.36 0.36
## pH -0.68 0.23 -0.54 -0.09
## sulphates 0.18 -0.26 0.31 0.01
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## fixed.acidity 0.09 -0.15 -0.11 0.67
## volatile.acidity 0.06 -0.01 0.08 0.02
## citric.acid 0.20 -0.06 0.04 0.36
## residual.sugar 0.06 0.19 0.20 0.36
## chlorides 1.00 0.01 0.05 0.20
## free.sulfur.dioxide 0.01 1.00 0.67 -0.02
## total.sulfur.dioxide 0.05 0.67 1.00 0.07
## density 0.20 -0.02 0.07 1.00
## pH -0.27 0.07 -0.07 -0.34
## sulphates 0.37 0.05 0.04 0.15
## pH sulphates
## fixed.acidity -0.68 0.18
## volatile.acidity 0.23 -0.26
## citric.acid -0.54 0.31
## residual.sugar -0.09 0.01
## chlorides -0.27 0.37
## free.sulfur.dioxide 0.07 0.05
## total.sulfur.dioxide -0.07 0.04
## density -0.34 0.15
## pH 1.00 -0.20
## sulphates -0.20 1.00
We observe \(fixed.acidity\) is mildly correlated with \(citric.acid\), \(density\) and \(pH\); \(citric.acid\) is mildly correlated with \(fixed.acidity\), \(volatile.acidity\) and \(pH\); \(free.sulfur.dioxide\) is mildly correlated with \(total.sulfur.dioxide\). We consider removing \(fixed.acidity\), \(citric.acid\) and \(free.sulfur.dioxide\). Let’s look at \(free.sulfur.dioxide\) firstly,
model1 <- lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(model1)
##
## Call:
## lm(formula = alcohol ~ fixed.acidity + volatile.acidity + citric.acid +
## residual.sugar + chlorides + total.sulfur.dioxide + density +
## pH + sulphates, data = redwines[, -c(12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.06145 -0.39706 -0.03917 0.34928 2.44848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.059e+02 1.302e+01 46.535 < 2e-16 ***
## fixed.acidity 5.300e-01 2.051e-02 25.846 < 2e-16 ***
## volatile.acidity 3.809e-01 1.128e-01 3.377 0.000749 ***
## citric.acid 8.548e-01 1.359e-01 6.289 4.12e-10 ***
## residual.sugar 2.827e-01 1.219e-02 23.198 < 2e-16 ***
## chlorides -1.487e+00 3.949e-01 -3.766 0.000172 ***
## total.sulfur.dioxide -2.775e-03 5.123e-04 -5.416 7.02e-08 ***
## density -6.160e+02 1.335e+01 -46.125 < 2e-16 ***
## pH 3.739e+00 1.534e-01 24.369 < 2e-16 ***
## sulphates 1.242e+00 1.036e-01 11.984 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.614 on 1589 degrees of freedom
## Multiple R-squared: 0.6699, Adjusted R-squared: 0.668
## F-statistic: 358.2 on 9 and 1589 DF, p-value: < 2.2e-16
Great, \(total.sulfur.dioxide\)’s p-vlaue decreases significantly.
anova(model1, full.model)
## Analysis of Variance Table
##
## Model 1: alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
## chlorides + total.sulfur.dioxide + density + pH + sulphates
## Model 2: alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
## chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## density + pH + sulphates
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1589 599.11
## 2 1588 598.70 1 0.40944 1.086 0.2975
Since F is large, we can drop the \(free.sulfur.dioxide\).
x = model.matrix(model1)[,-1]
dim(x)
## [1] 1599 9
x = x-matrix(apply(x,2,mean),1599,9, byrow=TRUE)
x = x/matrix(apply(x,2,sd),1599,9, byrow=TRUE)
apply(x,2,mean)
## fixed.acidity volatile.acidity citric.acid
## 3.518207e-16 1.841869e-16 -9.207575e-17
## residual.sugar chlorides total.sulfur.dioxide
## -1.156003e-16 8.888973e-17 4.302269e-17
## density pH sulphates
## 2.365840e-14 -2.488013e-17 2.009076e-17
apply(x,2,sd)
## fixed.acidity volatile.acidity citric.acid
## 1 1 1
## residual.sugar chlorides total.sulfur.dioxide
## 1 1 1
## density pH sulphates
## 1 1 1
e = eigen(t(x) %*% x)
sqrt(e$val[1]/e$val[9])
## [1] 5.262965
condition number is 5.26 lower than 30, there is no collinearity now.
Then we will use model1 in the following. ## Detect Unusual Observations ### Check High Leverage Points
n=dim(redwines)[1]
p=10
lev=influence(model1)$hat
highlev=lev[lev>2*p/n] #find high leverage points
highlev
## 14 18 20 34 39 43 46
## 0.02545301 0.02737606 0.02204734 0.02383297 0.01514653 0.02155009 0.01326053
## 82 84 87 92 93 95 96
## 0.04430844 0.03069371 0.05546544 0.05546544 0.05719765 0.01359738 0.01480742
## 107 110 127 128 143 145 152
## 0.04486258 0.01332999 0.01843085 0.01837401 0.01259396 0.01259396 0.09406530
## 162 170 182 199 227 241 244
## 0.01318846 0.03632233 0.01351370 0.01260013 0.03158334 0.01274861 0.02022239
## 245 259 282 292 325 326 340
## 0.02022239 0.08429367 0.02774343 0.03931076 0.02792361 0.02792361 0.01614457
## 355 379 397 401 416 443 452
## 0.01907500 0.01419726 0.01501122 0.01501122 0.01586589 0.01494878 0.03338733
## 464 481 495 516 554 555 556
## 0.01822883 0.06188658 0.01607714 0.01788008 0.02096122 0.01526251 0.01526251
## 558 592 615 640 650 652 653
## 0.01568960 0.01635403 0.02503975 0.01551114 0.02214230 0.01553989 0.04606766
## 673 685 691 693 696 701 711
## 0.02384996 0.01515557 0.01560221 0.03336782 0.01561996 0.01271687 0.01284710
## 724 731 755 796 837 838 862
## 0.04381317 0.03310494 0.03504952 0.01663087 0.01493434 0.01493434 0.03910420
## 890 912 918 924 1018 1019 1044
## 0.01262159 0.01897597 0.01622861 0.01622861 0.02527432 0.02527432 0.01598607
## 1052 1072 1075 1078 1079 1080 1082
## 0.03402287 0.01353322 0.01353322 0.01294298 0.01294298 0.05381562 0.05682189
## 1115 1132 1166 1187 1229 1236 1245
## 0.01921261 0.01312761 0.02530741 0.01546896 0.01266515 0.04301469 0.04763002
## 1261 1271 1289 1290 1300 1313 1317
## 0.03297166 0.01351017 0.01670309 0.01670309 0.03341683 0.01460558 0.02260574
## 1320 1322 1368 1371 1373 1375 1435
## 0.03897217 0.02066571 0.01296512 0.03548240 0.03548240 0.01763324 0.05831006
## 1436 1475 1477 1509 1559 1571 1575
## 0.05831006 0.04381646 0.04381646 0.01361479 0.01592611 0.01330012 0.06009596
## 1590
## 0.01271003
halfnorm(lev,6, labs=as.character(1:length(highlev)), ylab="Leverages")
### Check Outliers
StuR=rstudent(model1)
qt(0.05/(2*n),n-p-1)
## [1] -4.176071
sort(abs(StuR),decreasing=TRUE)[1:10]
## 652 560 565 396 354 557 559 494
## 4.038199 4.018534 4.018534 3.952545 3.858127 3.694477 3.694477 3.670237
## 500 268
## 3.670237 3.455404
No outliers
cook=cooks.distance(model1)
max(cook)
## [1] 0.07210158
halfnorm(cook,6,labs=as.character(1:length(cook)),ylab="Cook's distances")
We have some high leverage points, but none of them are high influential. We don’t have outliers.
plot(model1,which=1)
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 169.31, df = 9, p-value < 2.2e-16
Homoscedasticity doesn’t hold
plot(model1,which=2)
hist(model1$residuals)
ks.test(residuals(model1), y=pnorm)
## Warning in ks.test(residuals(model1), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(model1)
## D = 0.14307, p-value < 2.2e-16
## alternative hypothesis: two-sided
Normality doesn’t hold.
redwines_clean = redwines[-c(481, 1575),]
model_clean = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines_clean[,-c(12)])
summary(model_clean)
##
## Call:
## lm(formula = alcohol ~ fixed.acidity + volatile.acidity + citric.acid +
## residual.sugar + chlorides + total.sulfur.dioxide + density +
## pH + sulphates, data = redwines_clean[, -c(12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.07939 -0.39264 -0.04045 0.34956 2.48010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.060e+02 1.294e+01 46.839 < 2e-16 ***
## fixed.acidity 5.241e-01 2.043e-02 25.649 < 2e-16 ***
## volatile.acidity 3.704e-01 1.122e-01 3.302 0.000982 ***
## citric.acid 8.863e-01 1.359e-01 6.521 9.34e-11 ***
## residual.sugar 3.002e-01 1.266e-02 23.712 < 2e-16 ***
## chlorides -1.540e+00 3.925e-01 -3.924 9.07e-05 ***
## total.sulfur.dioxide -2.935e-03 5.102e-04 -5.752 1.06e-08 ***
## density -6.160e+02 1.327e+01 -46.419 < 2e-16 ***
## pH 3.717e+00 1.525e-01 24.370 < 2e-16 ***
## sulphates 1.232e+00 1.030e-01 11.962 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6101 on 1587 degrees of freedom
## Multiple R-squared: 0.6742, Adjusted R-squared: 0.6724
## F-statistic: 365 on 9 and 1587 DF, p-value: < 2.2e-16
plot(model_clean,which=1)
bptest(model_clean)
##
## studentized Breusch-Pagan test
##
## data: model_clean
## BP = 164.35, df = 9, p-value < 2.2e-16
plot(model_clean,which=2)
ks.test(residuals(model_clean), y=pnorm)
## Warning in ks.test(residuals(model_clean), y = pnorm): ties should not be
## present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(model_clean)
## D = 0.14385, p-value < 2.2e-16
## alternative hypothesis: two-sided
min(redwines$alcohol)
## [1] 8.4
Make sure all alchohol values are positive.
model.transformation=boxcox(model1,lambda=seq(-2, -1, by=0.01))
model.transformation$x[model.transformation$y==max(model.transformation$y)]
## [1] -1.53
\(\lambda\) is -1.53
model_bx=lm((alcohol^(-1.53)-1)/(-1.53) ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(model_bx)
##
## Call:
## lm(formula = (alcohol^(-1.53) - 1)/(-1.53) ~ fixed.acidity +
## volatile.acidity + citric.acid + residual.sugar + chlorides +
## total.sulfur.dioxide + density + pH + sulphates, data = redwines[,
## -c(12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0056555 -0.0010128 -0.0000063 0.0009974 0.0064010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.076e+00 3.340e-02 62.155 < 2e-16 ***
## fixed.acidity 1.304e-03 5.260e-05 24.785 < 2e-16 ***
## volatile.acidity 9.833e-04 2.893e-04 3.399 0.000692 ***
## citric.acid 2.060e-03 3.487e-04 5.908 4.23e-09 ***
## residual.sugar 6.800e-04 3.126e-05 21.751 < 2e-16 ***
## chlorides -4.670e-03 1.013e-03 -4.611 4.33e-06 ***
## total.sulfur.dioxide -8.396e-06 1.314e-06 -6.389 2.18e-10 ***
## density -1.491e+00 3.426e-02 -43.529 < 2e-16 ***
## pH 9.233e-03 3.936e-04 23.461 < 2e-16 ***
## sulphates 3.181e-03 2.658e-04 11.970 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001575 on 1589 degrees of freedom
## Multiple R-squared: 0.6509, Adjusted R-squared: 0.649
## F-statistic: 329.2 on 9 and 1589 DF, p-value: < 2.2e-16
All predictors are significant. We don’t need to move any predictors. Now let’s check unusual observations and assumptions again.
lev_new=influence(model_bx)$hat
highlev_new=lev_new[lev_new>2*p/n] #find high leverage points
highlev_new
## 14 18 20 34 39 43 46
## 0.02545301 0.02737606 0.02204734 0.02383297 0.01514653 0.02155009 0.01326053
## 82 84 87 92 93 95 96
## 0.04430844 0.03069371 0.05546544 0.05546544 0.05719765 0.01359738 0.01480742
## 107 110 127 128 143 145 152
## 0.04486258 0.01332999 0.01843085 0.01837401 0.01259396 0.01259396 0.09406530
## 162 170 182 199 227 241 244
## 0.01318846 0.03632233 0.01351370 0.01260013 0.03158334 0.01274861 0.02022239
## 245 259 282 292 325 326 340
## 0.02022239 0.08429367 0.02774343 0.03931076 0.02792361 0.02792361 0.01614457
## 355 379 397 401 416 443 452
## 0.01907500 0.01419726 0.01501122 0.01501122 0.01586589 0.01494878 0.03338733
## 464 481 495 516 554 555 556
## 0.01822883 0.06188658 0.01607714 0.01788008 0.02096122 0.01526251 0.01526251
## 558 592 615 640 650 652 653
## 0.01568960 0.01635403 0.02503975 0.01551114 0.02214230 0.01553989 0.04606766
## 673 685 691 693 696 701 711
## 0.02384996 0.01515557 0.01560221 0.03336782 0.01561996 0.01271687 0.01284710
## 724 731 755 796 837 838 862
## 0.04381317 0.03310494 0.03504952 0.01663087 0.01493434 0.01493434 0.03910420
## 890 912 918 924 1018 1019 1044
## 0.01262159 0.01897597 0.01622861 0.01622861 0.02527432 0.02527432 0.01598607
## 1052 1072 1075 1078 1079 1080 1082
## 0.03402287 0.01353322 0.01353322 0.01294298 0.01294298 0.05381562 0.05682189
## 1115 1132 1166 1187 1229 1236 1245
## 0.01921261 0.01312761 0.02530741 0.01546896 0.01266515 0.04301469 0.04763002
## 1261 1271 1289 1290 1300 1313 1317
## 0.03297166 0.01351017 0.01670309 0.01670309 0.03341683 0.01460558 0.02260574
## 1320 1322 1368 1371 1373 1375 1435
## 0.03897217 0.02066571 0.01296512 0.03548240 0.03548240 0.01763324 0.05831006
## 1436 1475 1477 1509 1559 1571 1575
## 0.05831006 0.04381646 0.04381646 0.01361479 0.01592611 0.01330012 0.06009596
## 1590
## 0.01271003
halfnorm(lev_new,6, labs=as.character(1:length(highlev_new)), ylab="Leverages")
### Check Outliers
StuR_new=rstudent(model_bx)
qt(0.05/(2*n),n-p-1)
## [1] -4.176071
sort(abs(StuR_new),decreasing=TRUE)[1:10]
## 652 244 245 494 500 557 559 560
## 4.116362 3.641438 3.641438 3.605038 3.605038 3.605006 3.605006 3.445001
## 565 372
## 3.445001 3.432353
No outliers
cook_new=cooks.distance(model_bx)
max(cook_new)
## [1] 0.07234619
halfnorm(cook_new,6,labs=as.character(1:length(cook_new)),ylab="Cook's distances")
We have some high leverage points, but none of them are high influential. We don’t have outliers.
plot(model_bx,which=1)
bptest(model_bx)
##
## studentized Breusch-Pagan test
##
## data: model_bx
## BP = 184.18, df = 9, p-value < 2.2e-16
Homoscedasticity doesn’t hold
plot(model_bx,which=2)
hist(model_bx$residuals)
ks.test(residuals(model_bx), y=pnorm)
## Warning in ks.test(residuals(model_bx), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(model_bx)
## D = 0.49774, p-value < 2.2e-16
## alternative hypothesis: two-sided
Normality doesn’t hold. Box-Cox Transformation can’t fix the violation to assumptions.
#fixed.acidity
modela1 = lm(alcohol ~ volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modela2 = lm(fixed.acidity ~ volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.a = lm(modela1$residuals ~ modela2$residuals)
#volatile.acidity
modelb1 = lm(alcohol ~ fixed.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modelb2 = lm(volatile.acidity ~ fixed.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.b = lm(modelb1$residuals ~ modelb2$residuals)
#citric.acid
modelc1 = lm(alcohol ~ fixed.acidity + volatile.acidity + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modelc2 = lm(citric.acid ~ fixed.acidity + volatile.acidity + residual.sugar + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.c = lm(modelc1$residuals ~ modelc2$residuals)
#residual.sugar
modeld1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modeld2 = lm(residual.sugar ~ fixed.acidity + volatile.acidity + citric.acid + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.d = lm(modeld1$residuals ~ modeld2$residuals)
#chlorides
modele1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modele2 = lm(chlorides ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.e = lm(modele1$residuals ~ modele2$residuals)
#total.sulfur.dioxide
modelf1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + density + pH + sulphates, data=redwines[,-c(12)])
modelf2 = lm(total.sulfur.dioxide ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + density + pH + sulphates, data=redwines[,-c(12)])
reg.f = lm(modelf1$residuals ~ modelf2$residuals)
#density
modelg1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + pH + sulphates, data=redwines[,-c(12)])
modelg2 = lm(density ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + pH + sulphates, data=redwines[,-c(12)])
reg.g = lm(modelg1$residuals ~ modelg2$residuals)
#pH
modelh1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + sulphates, data=redwines[,-c(12)])
modelh2 = lm(pH ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + sulphates, data=redwines[,-c(12)])
reg.h = lm(modelh1$residuals ~ modelh2$residuals)
#sulphates
modeli1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH, data=redwines[,-c(12)])
modeli2 = lm(sulphates ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + total.sulfur.dioxide + density + pH, data=redwines[,-c(12)])
reg.i = lm(modeli1$residuals ~ modeli2$residuals)
plot(modela2$residuals, modela1$residuals)
abline(reg.a, col="red")
plot(modelb2$residuals, modelb1$residuals)
abline(reg.b, col="red")
plot(modelc2$residuals, modelc1$residuals)
abline(reg.c, col="red")
plot(modeld2$residuals, modeld1$residuals)
abline(reg.d, col="red")
plot(modele2$residuals, modele1$residuals)
abline(reg.e, col="red")
plot(modelf2$residuals, modelf1$residuals)
abline(reg.f, col="red")
plot(modelg2$residuals, modelg1$residuals)
abline(reg.g, col="red")
plot(modelh2$residuals, modelh1$residuals)
abline(reg.h, col="red")
plot(modeli2$residuals, modeli1$residuals)
abline(reg.i, col="red")
All plots except the \(residual.sugar\), we try to transform the \(residual.sugar\).
#residual.sugar
modeld1 = lm(log(alcohol) ~ fixed.acidity + volatile.acidity + citric.acid + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modeld2 = lm(log(residual.sugar) ~ fixed.acidity + volatile.acidity + citric.acid + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.d = lm(modeld1$residuals ~ modeld2$residuals)
plot(modeld2$residuals, modeld1$residuals)
abline(reg.d, col="red")
We can find that \(\log({\text{residual.sugar}})\) works much better. Then, Let’s try the new model!
modelN=lm(log(alcohol) ~ fixed.acidity + volatile.acidity + citric.acid + log(residual.sugar) + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(modelN)
##
## Call:
## lm(formula = log(alcohol) ~ fixed.acidity + volatile.acidity +
## citric.acid + log(residual.sugar) + chlorides + total.sulfur.dioxide +
## density + pH + sulphates, data = redwines[, -c(12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21937 -0.03359 -0.00364 0.03292 0.24246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.974e+01 1.146e+00 52.119 < 2e-16 ***
## fixed.acidity 4.882e-02 1.776e-03 27.480 < 2e-16 ***
## volatile.acidity 2.479e-02 9.840e-03 2.519 0.011862 *
## citric.acid 6.561e-02 1.188e-02 5.521 3.93e-08 ***
## log(residual.sugar) 1.238e-01 4.292e-03 28.834 < 2e-16 ***
## chlorides -1.238e-01 3.448e-02 -3.591 0.000339 ***
## total.sulfur.dioxide -3.102e-04 4.470e-05 -6.939 5.73e-12 ***
## density -5.930e+01 1.175e+00 -50.477 < 2e-16 ***
## pH 3.355e-01 1.335e-02 25.128 < 2e-16 ***
## sulphates 1.169e-01 9.030e-03 12.941 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05355 on 1589 degrees of freedom
## Multiple R-squared: 0.7085, Adjusted R-squared: 0.7069
## F-statistic: 429.1 on 9 and 1589 DF, p-value: < 2.2e-16
\(volatile.acidity\) isn’t significant. Let’s try to remove it!
modelN2=lm(log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(modelN2)
##
## Call:
## lm(formula = log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) +
## chlorides + total.sulfur.dioxide + density + pH + sulphates,
## data = redwines[, -c(12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.220352 -0.033302 -0.003927 0.032990 0.246753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.943e+01 1.141e+00 52.064 < 2e-16 ***
## fixed.acidity 4.927e-02 1.770e-03 27.831 < 2e-16 ***
## citric.acid 4.999e-02 1.015e-02 4.923 9.42e-07 ***
## log(residual.sugar) 1.241e-01 4.296e-03 28.895 < 2e-16 ***
## chlorides -1.017e-01 3.340e-02 -3.044 0.00237 **
## total.sulfur.dioxide -2.959e-04 4.441e-05 -6.662 3.72e-11 ***
## density -5.898e+01 1.170e+00 -50.415 < 2e-16 ***
## pH 3.374e-01 1.335e-02 25.275 < 2e-16 ***
## sulphates 1.122e-01 8.854e-03 12.673 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05364 on 1590 degrees of freedom
## Multiple R-squared: 0.7073, Adjusted R-squared: 0.7059
## F-statistic: 480.4 on 8 and 1590 DF, p-value: < 2.2e-16
Now let’s check unusual observations and assumptions again. ## Check Unusual Observations ### Check High Leverages
p=9
levN=influence(modelN2)$hat
highlevN=levN[levN>2*p/n] #find high leverage points
highlevN
## 14 15 16 18 20 34 43
## 0.02374118 0.01131942 0.01178307 0.02724947 0.02097420 0.01425524 0.02038424
## 46 80 82 84 87 89 92
## 0.01325381 0.01305942 0.04422410 0.03070217 0.05467641 0.01196870 0.05467641
## 93 96 107 152 162 170 182
## 0.05618973 0.01473502 0.04488286 0.09288328 0.01367020 0.03578282 0.01327085
## 202 227 241 244 245 259 282
## 0.01183720 0.03071777 0.01264097 0.01720205 0.01720205 0.08369923 0.02796887
## 292 325 326 340 341 355 397
## 0.03220335 0.01616639 0.01616639 0.01612463 0.01130387 0.01842680 0.01334713
## 401 416 443 452 464 481 495
## 0.01334713 0.01537383 0.01257489 0.03219085 0.01425742 0.02350069 0.01303660
## 516 555 556 558 592 615 640
## 0.01671361 0.01521687 0.01521687 0.01565473 0.01640204 0.02320278 0.01552666
## 650 652 653 693 696 724 731
## 0.01925028 0.01462366 0.04313287 0.03300083 0.01549102 0.04359091 0.03301774
## 755 796 834 837 838 862 912
## 0.03514777 0.01144328 0.01259379 0.01488624 0.01488624 0.01783962 0.01375028
## 918 924 942 1018 1019 1044 1052
## 0.01264128 0.01264128 0.01162118 0.02534206 0.02534206 0.01134274 0.03472209
## 1072 1075 1078 1079 1080 1082 1115
## 0.01127439 0.01127439 0.01319684 0.01319684 0.05208184 0.05507926 0.02084965
## 1166 1187 1227 1229 1236 1245 1261
## 0.02508957 0.01321362 0.01192691 0.01183504 0.02194849 0.02310269 0.03172148
## 1270 1271 1289 1290 1317 1320 1322
## 0.01234367 0.01245992 0.01627316 0.01627316 0.02253381 0.03522612 0.02066344
## 1368 1371 1373 1375 1404 1435 1436
## 0.01237472 0.03388543 0.03388543 0.01839175 0.01184521 0.02385416 0.02385416
## 1471 1475 1477 1509 1559 1571 1575
## 0.01258353 0.02005344 0.02005344 0.01357720 0.01639033 0.01347953 0.03627797
halfnorm(levN,6, labs=as.character(1:length(highlevN)), ylab="Leverages")
### Check Outliers
StuRN=rstudent(modelN2)
qt(0.05/(2*n),n-p-1)
## [1] -4.176063
sort(abs(StuRN),decreasing=TRUE)[1:10]
## 652 511 494 500 244 245 560 565
## 4.664595 4.148651 3.907091 3.907091 3.812396 3.812396 3.810034 3.810034
## 410 354
## 3.733976 3.720389
#652 is an outlier.
cookN=cooks.distance(modelN2)
max(cookN)
## [1] 0.03541655
halfnorm(cookN,6,labs=as.character(1:length(cookN)),ylab="Cook's distances")
We have some high leverage points and one outlier, but none of them are high influential.
plot(modelN2,which=1)
bptest(modelN2)
##
## studentized Breusch-Pagan test
##
## data: modelN2
## BP = 134.14, df = 8, p-value < 2.2e-16
Homoscedasticity doesn’t hold
plot(modelN2,which=2)
hist(modelN2$residuals)
ks.test(residuals(modelN2), y=pnorm)
## Warning in ks.test(residuals(modelN2), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(modelN2)
## D = 0.44287, p-value < 2.2e-16
## alternative hypothesis: two-sided
Normality doesn’t hold. Still can’t fix the violation to assumptions.
We will use Generalized Least Squares next.
lm.resid=lm(abs(modelN2$residuals)~ fixed.acidity + citric.acid + log(residual.sugar) + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(lm.resid)
##
## Call:
## lm(formula = abs(modelN2$residuals) ~ fixed.acidity + citric.acid +
## log(residual.sugar) + chlorides + total.sulfur.dioxide +
## density + pH + sulphates, data = redwines[, -c(12)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.063461 -0.023523 -0.006196 0.016038 0.194110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.933e-02 7.160e-01 0.125 0.900725
## fixed.acidity 5.539e-03 1.110e-03 4.988 6.75e-07 ***
## citric.acid 2.473e-03 6.369e-03 0.388 0.697830
## log(residual.sugar) 9.709e-03 2.695e-03 3.603 0.000325 ***
## chlorides -2.636e-02 2.095e-02 -1.258 0.208466
## total.sulfur.dioxide 1.251e-05 2.786e-05 0.449 0.653363
## density -2.079e-01 7.338e-01 -0.283 0.777003
## pH 3.057e-02 8.374e-03 3.651 0.000270 ***
## sulphates 6.166e-03 5.554e-03 1.110 0.267046
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03364 on 1590 degrees of freedom
## Multiple R-squared: 0.06416, Adjusted R-squared: 0.05945
## F-statistic: 13.63 on 8 and 1590 DF, p-value: < 2.2e-16
redwines$weight = 1/lm.resid$fitted.values^2
head(redwines)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality weight
## 1 5 680.4699
## 2 5 821.0768
## 3 5 797.5032
## 4 6 392.0977
## 5 5 680.4699
## 6 5 695.7573
ModelGLS = lm(log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) + chlorides + total.sulfur.dioxide + density + pH + sulphates, data=redwines, weights=weight)
summary(ModelGLS)
##
## Call:
## lm(formula = log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) +
## chlorides + total.sulfur.dioxide + density + pH + sulphates,
## data = redwines, weights = weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -4.1487 -0.8459 -0.0803 0.7985 4.9741
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.324e+01 1.040e+00 60.788 < 2e-16 ***
## fixed.acidity 5.170e-02 1.736e-03 29.781 < 2e-16 ***
## citric.acid 4.506e-02 9.327e-03 4.831 1.49e-06 ***
## log(residual.sugar) 1.258e-01 4.370e-03 28.784 < 2e-16 ***
## chlorides -1.114e-01 2.447e-02 -4.552 5.72e-06 ***
## total.sulfur.dioxide -2.429e-04 4.076e-05 -5.959 3.12e-09 ***
## density -6.283e+01 1.067e+00 -58.909 < 2e-16 ***
## pH 3.355e-01 1.196e-02 28.057 < 2e-16 ***
## sulphates 1.201e-01 8.004e-03 15.004 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.288 on 1590 degrees of freedom
## Multiple R-squared: 0.7581, Adjusted R-squared: 0.7569
## F-statistic: 623 on 8 and 1590 DF, p-value: < 2.2e-16
p=9
lev=influence(ModelGLS)$hat
highlev=lev[lev>2*p/n] #find high leverage points
highlev
## 14 18 20 35 39 43 46
## 0.02406689 0.03006787 0.02615795 0.02197022 0.01584188 0.02060839 0.01306117
## 50 80 82 84 87 92 93
## 0.03057213 0.01480537 0.04985504 0.04472050 0.05556373 0.05556373 0.05514656
## 95 96 107 143 145 146 148
## 0.01377799 0.01419120 0.06185484 0.01315654 0.01315654 0.01173450 0.01501330
## 152 162 170 199 202 227 231
## 0.09704776 0.02279010 0.06431675 0.01678839 0.01127107 0.02461942 0.01228704
## 241 243 259 282 292 355 391
## 0.01212927 0.01360380 0.18483615 0.01999847 0.01599239 0.03899989 0.01293439
## 397 401 440 452 464 554 589
## 0.01151761 0.01151761 0.01968041 0.03747609 0.01798278 0.01364680 0.01235031
## 592 615 640 650 693 696 724
## 0.03424956 0.02587062 0.01214951 0.01915306 0.03219727 0.01662860 0.06696009
## 731 755 803 822 837 838 862
## 0.02353455 0.05093519 0.01457065 0.01224451 0.01794220 0.01794220 0.01686589
## 917 1018 1019 1052 1080 1082 1086
## 0.01441685 0.08863633 0.08863633 0.04277325 0.03560511 0.03729104 0.01507573
## 1099 1115 1127 1132 1158 1166 1227
## 0.01134655 0.02341700 0.01426647 0.01780662 0.01344471 0.02613311 0.01822012
## 1229 1236 1241 1245 1261 1270 1271
## 0.01441523 0.01966429 0.01129933 0.02027381 0.03196594 0.01726420 0.01644981
## 1289 1290 1296 1297 1299 1317 1320
## 0.01307448 0.01307448 0.01256181 0.01256181 0.01143870 0.02007992 0.03801925
## 1322 1368 1371 1373 1375 1390 1393
## 0.02058252 0.01474052 0.03750807 0.03750807 0.04764642 0.01880954 0.01587503
## 1404 1457 1471 1476 1478 1483 1509
## 0.01604786 0.01205987 0.01813317 0.01265119 0.01265119 0.01291716 0.01512627
## 1559 1567 1571 1575 1590
## 0.01795038 0.01255547 0.01490928 0.02931215 0.01177936
halfnorm(lev,6, labs=as.character(1:length(highlev)), ylab="Leverages")
### Check Outliers
StuR=rstudent(ModelGLS)
qt(0.05/(2*n),n-p-1)
## [1] -4.176063
sort(abs(StuR),decreasing=TRUE)[1:10]
## 1018 1019 1427 652 494 500 654 727
## 4.063682 4.063682 3.881344 3.777157 3.353329 3.353329 3.309287 3.243646
## 1234 372
## 3.234305 3.190007
No ouliers
cook=cooks.distance(ModelGLS)
sort(abs(cook),decreasing=TRUE)[1:10]
## 1018 1019 152 227 837 838 259
## 0.17672589 0.17672589 0.02577125 0.01996817 0.01929981 0.01929981 0.01859494
## 1320 1115 1132
## 0.01721969 0.01494359 0.01463477
halfnorm(cook,6,labs=as.character(1:length(cook)),ylab="Cook's distances")
We have some high leverage points, but none of them are high influential. No outliers.
plot(ModelGLS,which=1)
bptest(ModelGLS)
##
## studentized Breusch-Pagan test
##
## data: ModelGLS
## BP = 134.14, df = 8, p-value < 2.2e-16
Homoscedasticity doesn’t hold
plot(ModelGLS,which=2)
hist(ModelGLS$residuals)
ks.test(residuals(ModelGLS), y=pnorm)
## Warning in ks.test(residuals(ModelGLS), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(ModelGLS)
## D = 0.44265, p-value < 2.2e-16
## alternative hypothesis: two-sided
Still can’t fix the violation to assumptions.